Introduction

This analysis examines simulated income data for Hungary, focusing on the relationships between income and various demographic factors including age, location, occupation, and gender. The dataset is simulated to reflect realistic patterns while maintaining a manageable size for analysis.

Data Simulation

Descriptive Statistics

# Summary statistics with kable
summary_stats <- summary(data)
kable(summary_stats, caption = "Summary Statistics of the Dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Summary Statistics of the Dataset
age city occupation gender income
Min. : 0.00 Length:10000 Length:10000 Length:10000 Min. :-31517
1st Qu.:23.00 Class :character Class :character Class :character 1st Qu.:132102
Median :37.00 Mode :character Mode :character Mode :character Median :338604
Mean :38.01 NA NA NA Mean :315491
3rd Qu.:52.00 NA NA NA 3rd Qu.:485023
Max. :94.00 NA NA NA Max. :850812

Demographics Analysis

library(forcats)

# Create 2-year age bins starting from 0
data <- data %>%
  mutate(age_group = cut(age, 
                        breaks = seq(0, 100, by = 2), 
                        right = FALSE, 
                        include.lowest = TRUE, 
                        labels = seq(0, 98, by = 2)))

# Prepare data for detailed pyramid
dem_pyramid <- data %>%
  group_by(age_group, gender) %>%
  summarise(count = n(), .groups = 'drop') %>%
  mutate(count = ifelse(gender == "Male", -count, count))

# Plot detailed population pyramid
ggplot(dem_pyramid, aes(x = age_group, y = count, fill = gender)) +
  geom_bar(stat = "identity", width = 0.8, color = "black") +
  scale_y_continuous(labels = abs, expand = expansion(mult = c(0.05, 0.05))) +
  scale_fill_manual(values = c("Male" = "#00BFFF", "Female" = "#FF3B3B")) +
  coord_flip() +
  labs(title = "Population Pyramid of Simulated Hungarian Data",
       x = "Age Group",
       y = "Count",
       fill = "Gender") +
  custom_theme +
  theme(legend.position = "top",
        axis.text.y = element_text(size = 10, face = "bold"),
        plot.margin = margin(t = 20, r = 20, b = 20, l = 20))

# Income distribution by gender (working age only)
ggplot(data %>% filter(age <= 70 & age >= 18), aes(x = income, fill = gender)) +
  geom_density(alpha = 0.6) +
  scale_fill_viridis_d() +
  labs(title = "Income Distribution by Gender",
       subtitle = "(working age only)",
       x = "Income (HUF)",
       y = "Density") +
  custom_theme

# Income by occupation with jitter (working age only)
ggplot(data %>% filter(age <= 70 & age >= 18), aes(x = reorder(occupation, income, FUN = median), y = income, color = occupation)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(alpha = 0.1, width = 0.2) +
  scale_color_viridis_d() +
  coord_flip() +
  labs(title = "Income Distribution by Occupation",
       subtitle = "(working age only)",
       x = "Occupation",
       y = "Income (HUF)") +
  custom_theme

# Income by city with violin plot (working age only)
ggplot(data %>% filter(age <= 70 & age >= 18), aes(x = reorder(city, income, FUN = median), y = income, fill = city)) +
  geom_violin(alpha = 0.7) +
  geom_boxplot(width = 0.2, alpha = 0.5) +
  scale_fill_viridis_d() +
  coord_flip() +
  labs(title = "Income Distribution by City",
       subtitle = "(working age only)",
       x = "City",
       y = "Income (HUF)") +
  custom_theme

# Age vs income scatter plot with regression line (working age only)
ggplot(data %>% filter(age <= 70 & age >= 18), aes(x = age, y = income, color = gender)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess", se = TRUE) +
  scale_color_viridis_d() +
  labs(title = "Relationship between Age and Income",
       subtitle = "(working age only)",
       x = "Age",
       y = "Income (HUF)") +
  custom_theme

# Calculate mean income by category (working age only)
income_by_category <- data %>%
  filter(age <= 70 & age >= 18) %>%
  group_by(occupation, city, gender) %>%
  summarise(
    mean_income = mean(income),
    count = n(),
    .groups = 'drop'
  ) %>%
  arrange(desc(mean_income))

# Create a heatmap of mean income by occupation and city
ggplot(income_by_category, aes(x = city, y = occupation, fill = mean_income)) +
  geom_tile() +
  scale_fill_viridis(name = "Mean Income (HUF)") +
  facet_wrap(~gender) +
  labs(title = "Mean Income by Occupation, City, and Gender",
       subtitle = "(working age only)",
       x = "City",
       y = "Occupation") +
  custom_theme +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text = element_text(face = "bold"))

# Create a summary table of the top 10 highest earning combinations (working age only)
top_earners <- income_by_category %>%
  arrange(desc(mean_income)) %>%
  head(10)

kable(top_earners, 
      caption = "Top 10 Highest Earning Combinations",
      digits = 0) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE)
Top 10 Highest Earning Combinations
occupation city gender mean_income count
Software Developer Budapest Male 568530 112
Software Developer Debrecen Male 542354 46
Software Developer Budapest Female 535991 108
Doctor Budapest Female 522413 70
Software Developer Debrecen Female 517588 48
Manager Budapest Female 516757 100
Doctor Budapest Male 516183 58
Engineer Budapest Male 512031 132
Software Developer Szeged Male 509836 25
Software Developer Eger Male 499559 11

Hypothesis Testing

Parametric Tests

1. Gender Income Difference (t-test)

# Test if there's a significant difference in income between genders
t_test_result <- t.test(income ~ gender, data = data)
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  income by gender
## t = -2.2421, df = 9988.2, p-value = 0.02497
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -17644.966  -1183.765
## sample estimates:
## mean in group Female   mean in group Male 
##             310770.4             320184.7

2. City Income Differences (ANOVA)

# Test if there are significant differences in income between cities
city_anova <- aov(income ~ city, data = data)
print(summary(city_anova))
##               Df    Sum Sq   Mean Sq F value Pr(>F)    
## city           7 1.186e+13 1.694e+12   39.45 <2e-16 ***
## Residuals   9992 4.291e+14 4.295e+10                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3. Occupation Income Differences (ANOVA)

# Test if there are significant differences in income between occupations
occupation_anova <- aov(income ~ occupation, data = data)
print(summary(occupation_anova))
##               Df    Sum Sq  Mean Sq F value Pr(>F)    
## occupation     9 1.242e+13 1.38e+12   32.18 <2e-16 ***
## Residuals   9990 4.286e+14 4.29e+10                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Non-parametric Tests

1. Gender Income Distribution (Kolmogorov-Smirnov Test)

# Test if income distributions differ between genders
male_income <- data$income[data$gender == "Male"]
female_income <- data$income[data$gender == "Female"]
ks_test <- ks.test(male_income, female_income)
print(ks_test)
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  male_income and female_income
## D = 0.033246, p-value = 0.007961
## alternative hypothesis: two-sided

2. Income Distribution by City (Kruskal-Wallis Test)

# Test if income distributions differ between cities
kruskal_test <- kruskal.test(income ~ city, data = data)
print(kruskal_test)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  income by city
## Kruskal-Wallis chi-squared = 276.84, df = 7, p-value < 2.2e-16

Regression Analysis

Multiple Linear Regression

# Convert categorical variables to factors
data_working_age <- data %>% filter(age <= 70 & age >= 18)
data_working_age$city <- as.factor(data_working_age$city)
data_working_age$occupation <- as.factor(data_working_age$occupation)
data_working_age$gender <- as.factor(data_working_age$gender)

# Fit the model
model1 <- lm(income ~ age + I(age^2) + city + occupation + gender, data = data_working_age)

# Create a beautiful model summary table
model_summary <- summary(model1)
kable(tidy(model_summary), caption = "Multiple Linear Regression Results (Working Age Only)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Multiple Linear Regression Results (Working Age Only)
term estimate std.error statistic p.value
(Intercept) -103304.3227 5294.911397 -19.51011 0
age 17996.5194 250.036068 71.97569 0
I(age^2) -100.0189 2.895668 -34.54088 0
cityDebrecen -50555.7719 1708.534326 -29.59014 0
cityEger -105861.9735 2441.761341 -43.35476 0
cityGyőr -100901.8196 2201.479335 -45.83364 0
cityMiskolc -98022.3650 1990.245618 -49.25139 0
cityPécs -104132.3473 2195.486122 -47.43020 0
citySzeged -54481.4626 1911.615619 -28.50022 0
citySzombathely -97368.4232 2474.808146 -39.34383 0
occupationChef -34050.3543 2937.191518 -11.59283 0
occupationDoctor 63015.0930 3036.507567 20.75249 0
occupationDriver -45548.0684 2979.646552 -15.28640 0
occupationEngineer 27022.3777 2381.063430 11.34887 0
occupationManager 44518.4166 2656.314580 16.75947 0
occupationNurse -24868.5619 2271.901452 -10.94614 0
occupationSales Representative -55848.7747 2030.431916 -27.50586 0
occupationSoftware Developer 92726.8736 2575.317522 36.00600 0
occupationTeacher -15803.4550 2157.111990 -7.32621 0
genderMale 20841.9232 1119.996419 18.60892 0
# Enhanced model diagnostics
par(mfrow = c(2,2))
plot(model1, col = income_palette[1], pch = 19, cex = 0.7)

Polynomial Regression for Age-Income Relationship

# Fit polynomial regression
model2 <- lm(income ~ poly(age, 3), data = data_working_age)

# Create a static plot
ggplot(data_working_age, aes(x = age, y = income)) +
  geom_point(alpha = 0.1, color = income_palette[1]) +
  geom_smooth(method = "lm", formula = y ~ poly(x, 3), 
              color = income_palette[5], fill = income_palette[5], alpha = 0.2) +
  labs(title = "Polynomial Regression: Age vs Income",
       subtitle = "Cubic polynomial fit with confidence interval (Working Age Only)",
       x = "Age",
       y = "Income (HUF)") +
  custom_theme

# Model comparison
model_comparison <- data.frame(
  Model = c("Multiple Linear", "Polynomial"),
  R_squared = c(summary(model1)$r.squared, summary(model2)$r.squared),
  Adj_R_squared = c(summary(model1)$adj.r.squared, summary(model2)$adj.r.squared)
)

kable(model_comparison, caption = "Model Comparison (Working Age Only)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Model Comparison (Working Age Only)
Model R_squared Adj_R_squared
Multiple Linear 0.9000774 0.8998350
Polynomial 0.7386924 0.7385925

Predictions

# Create a sample prediction
new_data <- data.frame(
  age = 35,
  city = "Budapest",
  occupation = "Software Developer",
  gender = "Male"
)

# Predict income
prediction <- predict(model1, newdata = new_data, interval = "prediction")
print(prediction)
##        fit      lwr      upr
## 1 517619.5 420425.2 614813.8

Summary and Conclusions

The analysis of the simulated Hungarian income data reveals several interesting patterns:

  1. There is a significant gender pay gap, with males earning more on average than females.
  2. Income varies significantly across different cities, with Budapest showing the highest average income.
  3. Occupation has a strong effect on income, with software developers and doctors earning the most.
  4. Age shows a non-linear relationship with income, peaking in the middle age range.
  5. The regression models explain a significant portion of the income variation.

The analysis demonstrates the complex interplay between various factors affecting income levels in Hungary. The findings suggest that both demographic factors (age, gender) and professional factors (occupation, location) play important roles in determining income levels.

References

  • R Core Team (2023). R: A language and environment for statistical computing.
  • Wickham, H. (2016). ggplot2: Elegant Graphics for Data Analysis.
  • Fox, J. (2016). Applied Regression Analysis and Generalized Linear Models.